Load data

# Read the survey export into `mh`; text columns are converted to factors
mh <- read.csv(
  file = "~/Desktop/survey2.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)

# Print the structure (column types and factor levels) of the loaded data
str(mh)
## 'data.frame':    1433 obs. of  63 variables:
##  $ self_employed                           : int  0 0 0 1 0 0 0 0 0 1 ...
##  $ num_employees                           : Factor w/ 7 levels "0","1 to 5","100-500",..: 4 6 6 1 6 7 4 7 4 1 ...
##  $ tech_company                            : Factor w/ 3 levels "0","1","n/a": 2 2 2 3 1 2 2 2 1 3 ...
##  $ primary_role                            : Factor w/ 3 levels "0","1","n/a": 3 3 3 3 2 3 3 3 2 3 ...
##  $ mental_health_coverage                  : Factor w/ 3 levels "No","Yes","n/a": 3 1 1 3 2 2 3 2 3 3 ...
##  $ mental_health_options                   : Factor w/ 4 levels "N/A","No","Yes",..: 4 3 1 4 3 4 2 3 2 4 ...
##  $ mental_health_formally_discussed        : Factor w/ 3 levels "No","Yes","n/a": 1 2 1 3 1 1 1 1 1 3 ...
##  $ mental_health_resources                 : Factor w/ 3 levels "No","Yes","n/a": 1 2 1 3 1 2 1 2 1 3 ...
##  $ anonymity_protected                     : Factor w/ 3 levels "No","Yes","n/a": 3 2 3 3 1 2 3 2 3 3 ...
##  $ medical_leave                           : Factor w/ 6 levels "Neither easy nor difficult",..: 5 3 1 6 1 3 3 5 4 6 ...
##  $ mental_health_negative                  : Factor w/ 3 levels "No","Yes","n/a": 1 1 3 3 2 2 1 1 2 3 ...
##  $ physical_health_negative                : Factor w/ 3 levels "No","Yes","n/a": 1 1 1 3 3 2 1 1 2 3 ...
##  $ mental_health_comfort_coworker          : Factor w/ 3 levels "No","Yes","n/a": 3 3 3 3 3 3 3 3 2 3 ...
##  $ mental_health_comfort_supervisor        : Factor w/ 3 levels "No","Yes","n/a": 2 2 3 3 1 2 2 2 3 3 ...
##  $ mental_health_taken_seriously           : Factor w/ 3 levels "No","Yes","n/a": 3 2 3 3 1 1 2 3 1 3 ...
##  $ coworker_negative_consequences          : Factor w/ 3 levels "No","Yes","n/a": 1 1 1 3 1 2 1 1 1 3 ...
##  $ private_med_coverage                    : Factor w/ 3 levels "0","1","n/a": 3 3 3 2 3 3 3 3 3 2 ...
##  $ resources_awareness                     : Factor w/ 4 levels "I know some",..: 4 4 4 3 4 4 4 4 4 1 ...
##  $ reveal_diagnosis_clients_or_business    : Factor w/ 6 levels "No, because it doesn't matter",..: 6 6 6 4 6 6 6 6 6 1 ...
##  $ revealed_negative_consequences_CB       : Factor w/ 4 levels "N/A","No","Yes",..: 4 4 4 4 4 4 4 4 4 1 ...
##  $ reveal_diagnosis_coworkers              : Factor w/ 5 levels "No, because it doesn't matter",..: 5 5 5 3 5 5 5 5 5 3 ...
##  $ revealed_negative_consequences_CW       : Factor w/ 3 levels "No","Yes","n/a": 3 3 3 3 3 3 3 3 3 1 ...
##  $ productivity_effected                   : Factor w/ 3 levels "No","Yes","n/a": 3 3 3 2 3 3 3 3 3 2 ...
##  $ percentage                              : Factor w/ 5 levels "1-25%","26-50%",..: 5 5 5 1 5 5 5 5 5 1 ...
##  $ previous_employer                       : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ prevemp_mental_health_coverage          : Factor w/ 5 levels "","I don't know",..: 3 5 3 4 2 3 4 4 2 4 ...
##  $ prevemp_mental_health_options           : Factor w/ 5 levels "","I was aware of some",..: 3 2 3 3 3 5 2 2 3 2 ...
##  $ prevemp_mental_health_formally_discussed: Factor w/ 5 levels "","I don't know",..: 2 3 3 3 4 3 3 4 4 3 ...
##  $ prevemp_mental_health_resources         : Factor w/ 4 levels "","None did",..: 2 3 3 2 2 2 3 3 2 2 ...
##  $ prevemp_anonymity_protected             : Factor w/ 5 levels "","I don't know",..: 2 5 2 2 2 2 2 4 2 2 ...
##  $ prevemp_mental_health_negative          : Factor w/ 5 levels "","I don't know",..: 4 3 2 4 4 5 3 4 5 4 ...
##  $ prevemp_physical_health_negative        : Factor w/ 4 levels "","None of them",..: 2 2 3 3 3 3 2 3 4 3 ...
##  $ prevemp_mental_health_coworker          : Factor w/ 4 levels "","No, at none of my previous employers",..: 3 2 3 3 2 2 3 3 2 3 ...
##  $ prevemp_mental_health_comfort_supervisor: Factor w/ 5 levels "","I don't know",..: 4 4 2 4 4 3 5 4 3 4 ...
##  $ prevemp_mental_health_taken_seriously   : Factor w/ 5 levels "","I don't know",..: 2 4 2 2 4 3 4 4 3 2 ...
##  $ prevemp_coworker_negative_consequences  : Factor w/ 4 levels "","None of them",..: 2 2 3 3 3 3 2 3 2 2 ...
##  $ phsyical_issue_interview                : Factor w/ 3 levels "Maybe","No","Yes": 1 1 3 3 1 3 3 2 1 3 ...
##  $ why_physical                            : Factor w/ 1087 levels ""," Don't trust potential employers to not judge.",..: 1 664 907 1065 99 539 328 269 734 202 ...
##  $ mental_health_interview                 : Factor w/ 3 levels "Maybe","No","Yes": 1 2 3 1 2 1 3 2 1 1 ...
##  $ why_mental                              : Factor w/ 1082 levels ""," I don't want to poison the well... people are prejudiced and in denial about their own issues and vulnerabilit"| __truncated__,..: 1 950 848 1079 259 577 359 804 707 562 ...
##  $ career_hurt                             : Factor w/ 5 levels "Maybe","No, I don't think it would",..: 1 2 1 4 4 4 4 1 1 1 ...
##  $ viewed_negatively_by_coworkers          : Factor w/ 5 levels "Maybe","No, I don't think they would",..: 2 2 1 1 1 1 2 1 5 2 ...
##  $ share_with_family                       : Factor w/ 6 levels "Neutral","Not applicable to me (I do not have a mental illness)",..: 5 5 5 1 5 5 2 5 5 6 ...
##  $ observed_poor_handling                  : Factor w/ 5 levels "Maybe/Not sure",..: 3 3 1 3 4 4 3 5 5 3 ...
##  $ observations_lead_less_likely_to_reveal : Factor w/ 5 levels "","Maybe","N/A",..: 1 1 5 1 5 4 1 2 4 1 ...
##  $ family_history                          : Factor w/ 3 levels "I don't know",..: 2 3 2 2 3 2 2 3 3 3 ...
##  $ ever_had_mental_disorder                : Factor w/ 3 levels "Maybe","No","Yes": 3 3 1 3 3 2 2 3 3 3 ...
##  $ currently_have_mental_disorder          : Factor w/ 3 levels "Maybe","No","Yes": 2 3 2 3 3 3 2 3 3 3 ...
##  $ if_yes_what                             : Factor w/ 129 levels "","Addictive Disorder",..: 1 12 1 12 67 68 1 69 85 12 ...
##  $ if_maybe_what                           : Factor w/ 100 levels "","Addictive Disorder",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ medical_prof_diagnosis                  : Factor w/ 2 levels "No","Yes": 2 2 1 2 2 1 1 2 2 2 ...
##  $ what_conditions                         : Factor w/ 117 levels "","ADD (w/o Hyperactivity)",..: 5 11 1 11 5 1 1 54 70 11 ...
##  $ sought_prof_treatment                   : int  0 1 1 1 1 1 0 1 1 1 ...
##  $ treatment_affects_work                  : Factor w/ 5 levels "Never","Not applicable to me",..: 2 4 2 5 5 2 2 5 4 4 ...
##  $ no_treatment_affects_work               : Factor w/ 5 levels "Never","Not applicable to me",..: 2 5 2 5 5 3 2 3 3 3 ...
##  $ age                                     : int  39 29 38 43 43 42 30 37 44 30 ...
##  $ gender                                  : Factor w/ 72 levels ""," Female","AFAB",..: 30 64 31 64 15 30 28 54 15 30 ...
##  $ country                                 : Factor w/ 53 levels "Afghanistan",..: 50 51 50 50 51 50 51 51 51 51 ...
##  $ state                                   : Factor w/ 48 levels "","Alabama","Alaska",..: 1 13 1 1 13 1 41 45 5 17 ...
##  $ country_work                            : Factor w/ 53 levels "Afghanistan",..: 50 51 50 50 51 50 51 51 51 51 ...
##  $ state_work                              : Factor w/ 49 levels "","Alabama","Alaska",..: 1 14 1 1 14 1 42 46 5 18 ...
##  $ work_position                           : Factor w/ 264 levels "Back-end Developer",..: 1 9 1 206 88 66 1 32 255 154 ...
##  $ remote_work                             : Factor w/ 3 levels "Always","Never",..: 3 2 1 3 3 3 3 1 3 1 ...

Cleaning the gender column

#head(mh$gender)
#levels(mh$gender)
#summary(mh$gender)

# Standardise gender
# Collapse the free-text `gender` responses into a two-level factor `sex`
# ("female" / "male"). Responses matching neither pattern list become NA.
#
# Word boundaries are written "\\b": inside an R string literal a single "\b"
# is the backspace character, so the `\bmale` / `\bman` alternatives could
# never match as intended.
female <- grep(
  "(fe).*|^f$|fm|woman|female",
  levels(mh$gender),
  ignore.case = TRUE, perl = TRUE, value = TRUE
)
male <- grep(
  "^m$|\\bmale| male|mail|male |^male| man|\\bman|^man$|masculine|dude|^male$",
  levels(mh$gender),
  ignore.case = TRUE, perl = TRUE, value = TRUE
)

# Recode on character values so the result does not depend on "female" and
# "male" already being levels of `gender` (assigning an unknown level to a
# factor silently produces NA). A response matching both lists is treated as
# female, as before, because the female recode is applied first.
mh$sex <- rep(NA_character_, nrow(mh))
mh$sex[as.character(mh$gender) %in% female] <- "female"
mh$sex[is.na(mh$sex) & as.character(mh$gender) %in% male] <- "male"
mh$sex <- factor(mh$sex, levels = c("female", "male"))
# Sanity checks on the recode: rows labelled female, rows labelled male, and
# rows that matched neither list
summary(mh$sex %in% female)
##    Mode   FALSE    TRUE 
## logical    1088     345
summary(mh$sex %in% male)
##    Mode   FALSE    TRUE 
## logical     375    1058
summary(!(mh$sex %in% c(male, female)))
##    Mode   FALSE    TRUE 
## logical    1403      30
# Remove factor levels that no longer occur, then tabulate the result
mh$sex <- droplevels(mh$sex)
table(mh$sex)
## 
## female   male 
##    345   1058
# Omit the rows whose sex is NA. Missing values are tested with is.na():
# `sex != "NA"` compares against the literal string "NA" and only removed the
# rows because filter() discards rows whose condition evaluates to NA.
mh <- mh %>% filter(!is.na(sex))

Frequency

Frequency Distribution and Histogram of all factors

Frequency Distribution of Number of Employees

# Drop the rows in the "0" size band (the self-employed data)
noofemployees <- filter(mh, num_employees != "0")

# Re-level the size bands so they are plotted in ascending order
noofemployees$num_employees <- factor(
  noofemployees$num_employees,
  levels = c(
    "1 to 5", "6 to 25", "26-100", "100-500", "500-1000", "More than 1000"
  )
)

# Bar chart of respondents per size band, with the count above each bar
noemployees <- ggplot(noofemployees, aes(x = num_employees))
noemployees +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Number of Employees") +
  xlab("Number of Employees") +
  theme_bw()

Frequency Distribution of Tech-company

# Drop the rows whose tech_company answer is "n/a" (the self-employed data)
techcomp <- filter(mh, tech_company != "n/a")

# Bar chart of the tech_company answers, with the count above each bar
techcom <- ggplot(techcomp, aes(x = tech_company))
techcom +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Tech Company Or Not") +
  xlab("0 is No, 1 is Yes") +
  theme_bw()

Frequency Distribution of Primary Role

# Drop the rows whose primary_role answer is "n/a"
primaryrole <- filter(mh, primary_role != "n/a")

# Bar chart of the primary_role answers, with the count above each bar
prirole <- ggplot(primaryrole, aes(x = primary_role))
prirole +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Primary Role is Tech Or Not") +
  xlab("0 is No, 1 is Yes") +
  theme_bw()

Frequency Distribution of Currently Have Mental Disorder Or Not

# Bar chart of the currently_have_mental_disorder answers (all respondents),
# with the count above each bar
CHMD <- ggplot(mh, aes(x = currently_have_mental_disorder))
CHMD +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Currently Have Mental Disorder Or Not") +
  xlab("") +
  theme_bw()

Frequency Distribution of Age

# Inspect the age distribution before cleaning
summary(mh$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    3.00   28.00   33.00   34.31   39.00  323.00
# Replace the incorrect ages (3 and 323) with the median age, 33
mh$age[mh$age %in% c(3, 323)] <- 33

# Plot the distribution.
# The count labels are computed with the same binning as the bars
# (stat = "bin", 30 bins). The previous stat = "count" labels counted every
# distinct age separately, so they did not correspond to the histogram bars.
# Stating `bins` explicitly also avoids the stat_bin() default-bins message.
ggplot(mh, aes(age)) +
  geom_histogram(bins = 30) +
  geom_text(stat = "bin", aes(label = ..count..), bins = 30, vjust = -1) +
  ggtitle("Distribution of Ages") +
  xlab("Age") +
  theme_bw()

Frequency Distribution of Country Worked

# Bar chart of respondents per country of work, with the count above each bar;
# axis text is rotated 90 degrees
CW <- ggplot(mh, aes(x = country_work))
CW +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Country Worked") +
  xlab("Country Worked") +
  theme(axis.text = element_text(angle = 90, hjust = 1))

Frequency Distribution of Remote Work Or Not

# Bar chart of the remote_work answers, with the count above each bar
RW <- ggplot(mh, aes(x = remote_work))
RW +
  geom_bar() +
  geom_text(stat = "count", aes(label = ..count..), vjust = -1) +
  ggtitle("Distribution of Remote Work Or Not") +
  theme_bw()

Does Company Size Matter?

# Group the employer-size subset by size band and current mental-disorder status
Company.size <- group_by(
  noofemployees,
  num_employees, currently_have_mental_disorder
)

# One panel per size band: number of respondents per status
# (each panel has its own y scale)
ggplot(Company.size, aes(x = currently_have_mental_disorder)) +
  geom_bar() +
  facet_wrap(~num_employees, scales = "free_y") +
  ggtitle("How many workers Current have mental disorder in different size company?") +
  xlab("Currently with mental disorder?") +
  theme_bw()

# Calculate counts and frequencies
# NOTE(review): plyr is detached before dplyr is (re)attached, presumably so
# that summarise() below resolves to dplyr's version; confirm.
detach("package:plyr", unload = TRUE)
## Warning: 'plyr' namespace cannot be unloaded:
##   namespace 'plyr' is imported by 'scales', 'ggplot2' so cannot be unloaded
library(dplyr)

# n    = respondents per (size band, status)
# freq = share of that status within its size band
freq1 <- group_by(Company.size, num_employees, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Stacked bar chart of the within-band shares, rendered interactively
pfreq1 <- ggplot(
  freq1,
  aes(x = num_employees, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  xlab("Company Size") +
  ylab("Frequency")
ggplotly(pfreq1)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Chi-squared test of independence: company size vs. current mental disorder
# H0: the two variables are independent (no relationship)
# H1: the two variables are related
chisq.test(
  table(
    noofemployees$num_employees,
    noofemployees$currently_have_mental_disorder
  )
)
## 
##  Pearson's Chi-squared test
## 
## data:  table(noofemployees$num_employees, noofemployees$currently_have_mental_disorder)
## X-squared = 15.15, df = 10, p-value = 0.1267

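Based on the result of the chi-squared test (p = 0.1267 > 0.05), we fail to reject H0: there is no statistically significant evidence of a relationship between company size and currently having a mental disorder.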

Does working at a tech company matter?

# Group the tech_company subset by sex and current mental-disorder status
Tech.company <- group_by(techcomp, sex, currently_have_mental_disorder)

# One panel per tech_company value: number of respondents per status
# (each panel has its own y scale)
ggplot(Tech.company, aes(x = currently_have_mental_disorder)) +
  geom_bar() +
  facet_wrap(~tech_company, scales = "free_y") +
  ggtitle("How many employees Current have mental disorder in tech company") +
  xlab("Currently with mental disorder?") +
  theme_bw()

# Counts and within-sex shares of each mental-disorder status
# n    = respondents per (sex, status)
# freq = share of that status within the sex
freq2 <- group_by(Tech.company, sex, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Print the table
freq2
## # A tibble: 6 x 4
## # Groups:   sex [2]
##   sex    currently_have_mental_disorder     n  freq
##   <fct>  <fct>                          <int> <dbl>
## 1 female Maybe                             47 0.171
## 2 female No                                82 0.298
## 3 female Yes                              146 0.531
## 4 male   Maybe                            202 0.238
## 5 male   No                               359 0.423
## 6 male   Yes                              288 0.339
# Stacked bar chart of the shares in freq2 by sex, rendered interactively
pfreq2 <- ggplot(
  freq2,
  aes(x = sex, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  xlab("Gender") +
  ylab("Frequency")
ggplotly(pfreq2)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

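Among respondents who answered the tech-company question, a larger share of female respondents (53.1%) than male respondents (33.9%) report currently having a mental disorder. Note that this table is broken down by sex only, so it does not by itself compare tech companies with non-tech companies.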

Does having a tech role matter?

# Group the primary_role subset by sex and current mental-disorder status
Tech.role <- group_by(primaryrole, sex, currently_have_mental_disorder)

# One panel per sex: number of respondents per status
# (each panel has its own y scale)
ggplot(Tech.role, aes(x = currently_have_mental_disorder)) +
  geom_bar() +
  facet_wrap(~sex, scales = "free_y") +
  ggtitle("How many tech workers current have mental disorder") +
  xlab("Currently with mental disorder?") +
  theme_bw()

# Counts and within-sex shares of each mental-disorder status
# n    = respondents per (sex, status)
# freq = share of that status within the sex
freq3 <- group_by(Tech.role, sex, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Print the table
freq3
## # A tibble: 6 x 4
## # Groups:   sex [2]
##   sex    currently_have_mental_disorder     n  freq
##   <fct>  <fct>                          <int> <dbl>
## 1 female Maybe                             11 0.157
## 2 female No                                23 0.329
## 3 female Yes                               36 0.514
## 4 male   Maybe                             47 0.249
## 5 male   No                                78 0.413
## 6 male   Yes                               64 0.339
# Stacked bar chart of the shares in freq3 by sex, rendered interactively
pfreq3 <- ggplot(
  freq3,
  aes(x = sex, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  xlab("Gender") +
  ylab("Frequency")
ggplotly(pfreq3)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

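Among respondents who answered the primary-role question, a larger share of female respondents (51.4%) than male respondents (33.9%) report currently having a mental disorder. Note that this table is broken down by sex only, so it does not by itself compare tech roles with non-tech roles.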

Does age matter?

# Replace the age value 99 with the median age, 33
mh$age[mh$age %in% 99] <- 33

# Group all respondents by age and current mental-disorder status
Age.Re <- group_by(mh, age, currently_have_mental_disorder)

# Counts and within-age shares of each status
# n    = respondents per (age, status)
# freq = share of that status within the age
library(dplyr)
freq4 <- group_by(Age.Re, age, currently_have_mental_disorder) %>%
  summarise(n = n()) %>%
  mutate(freq = n / sum(n))

# Print the table
freq4
## # A tibble: 126 x 4
## # Groups:   age [50]
##      age currently_have_mental_disorder     n  freq
##    <dbl> <fct>                          <int> <dbl>
##  1  15.0 No                                 1 1.00 
##  2  17.0 No                                 1 1.00 
##  3  19.0 Maybe                              1 0.250
##  4  19.0 No                                 2 0.500
##  5  19.0 Yes                                1 0.250
##  6  20.0 Maybe                              2 0.333
##  7  20.0 No                                 3 0.500
##  8  20.0 Yes                                1 0.167
##  9  21.0 Maybe                              5 0.357
## 10  21.0 No                                 3 0.214
## # ... with 116 more rows
# Stacked bar chart of the shares in freq4 by age, rendered interactively
pfreq4 <- ggplot(
  freq4,
  aes(x = age, y = freq, fill = currently_have_mental_disorder)
) +
  geom_bar(stat = "identity") +
  xlab("Age") +
  ylab("Frequency")
ggplotly(pfreq4)
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
# Chi-squared test of independence: age vs. current mental disorder
# H0: the two variables are independent (no relationship)
# H1: the two variables are related
chisq.test(
  table(mh$age, mh$currently_have_mental_disorder)
)
## Warning in chisq.test(table(mh$age, mh$currently_have_mental_disorder)):
## Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  table(mh$age, mh$currently_have_mental_disorder)
## X-squared = 95.517, df = 98, p-value = 0.5522

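Based on the result of the chi-squared test (p = 0.5522 > 0.05), we fail to reject H0: there is no statistically significant evidence of a relationship between age and currently having a mental disorder. Note that R warned that the chi-squared approximation may be incorrect for this table, so this result should be read with caution.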